*===============================================================================
*
*					WORKER BELIEFS ABOUT OUTSIDE OPTIONS
*		(c)	Simon Jaeger, Christopher Roth, Nina Roussille, Benjamin Schoefer
*							  2023 December 5
*						   	   SOEP-IAB Data 
*
*===============================================================================


********************************************************************************
*								Machinelearning	 							   *
********************************************************************************

* Session setup: close any log left open, start this file's log, and fix the RNG seed.
cap log close
* Path is quoted (like every other path in this file) so a $log directory containing
* spaces does not break the command.
log using "${log}/5_machinelearning.log", replace
set seed 6000

* setting matsize big enough to deal with a very long list of covariates
set matsize 5000



*** Build the dataset of EUE coworker moves with relevant covariates, used for the ML prediction calculations

* Collect the IDs of the EUE movers whose characteristics we need.
* (machinelearning_unempFT.dta is written by 2_coworker_wagechanges while it builds the EUE coworker-move dataset.)
use "$data/machinelearning_unempFT.dta", clear
keep prs_id

* One row per ID: keep the first observation of every prs_id group
bysort prs_id: keep if _n == 1

save "$data/machinelearning_ids.dta", replace

* Base covariate list; further covariates are appended to this global below
global covlist1 "ln_wage firm_effect age1 age2 age3 tenure1 tenure2 tenure3 female firmsize1 firmsize2 firmsize3 separation_rate sdwages growth"

* Load the full IAB spell history. It reaches back to 2000: only moves in 2015-2019 are used,
* but earlier years are needed to calculate tenure and other variables.
use prs_id betnr ieb_beg_epi ieb_end_epi ieb_tag_entg ieb_quellverf_id ///
	ieb_beruf_kons_num b_gem_schl_ao_akt_num wz08_kons_num ieb_dba_id geb_dat sex_id ///
	using "$orig/coworker_fullhist_IEB.dta", clear

* Flag GSOEP individuals.
* NOTE(review): getlink is defined outside this file; the code below relies on it leaving
* pid and _merge in memory -- confirm against its definition.
getlink, mergevar(prs_id)
drop if _merge == 2
drop _merge
gen gsoep = pid != .
rename pid soep_pid

* Keep only workplaces that contain a member of the mover dataset OR a SOEP person:
* movers are needed to estimate the ML coefficients, SOEP people to form predictions from them.
di "machinelearning_ids merge"
merge m:1 prs_id using "$data/machinelearning_ids.dta", keep(master match)
gen moversample = _merge == 3
drop _merge

* 1 for every row of a workplace in which at least one row is a mover or a SOEP person
egen keep_firm = max(moversample | gsoep), by(betnr)
keep if keep_firm == 1
drop keep_firm

* year (taken from the spell start date)
gen jahr = year(ieb_beg_epi)

* keeping only employment spells
keep if betnr!=.

* age
gen yob = year(geb_dat)
gen age = jahr-yob

* dropping under-16s (rows with missing age are kept, since missing is not < 16)
drop if age<16

* log wage, for GSOEP movers
gen ln_wage_non_winsorized = ln(ieb_tag_entg)
winsor ln_wage_non_winsorized, p(0.02) generate(ln_wage)
drop ln_wage_non_winsorized
rename ln_wage ln_wage_gsoep

* making dataset unique by ID-firm-year (note this is NOT narrowing down to main spells)
* Step 1: keep only the highest-wage spell(s) within each ID-firm-year
egen maxwage = max(ln_wage_gsoep), by(prs_id betnr jahr)
drop if ln_wage_gsoep!=maxwage
drop maxwage

* Step 2: if several spells tie on the wage, keep exactly one of them.
* A plain -sort- orders tied observations arbitrarily, so the retained spell (and with it the
* occupation / region / education carried on that row) could change from run to run.
* Breaking ties on spell start, spell end and source ID makes the choice reproducible:
* the earliest-starting spell is kept. (Spells identical on all three would still tie.)
bysort prs_id betnr jahr (ieb_beg_epi ieb_end_epi ieb_quellverf_id): keep if _n==1

* Mark one row per firm-year as "designated". Firm-year statistics are kept on that row only,
* so that averaging them across years within a firm weights every year once.
* dup is 0 for a singleton firm-year, otherwise the row's position within the firm-year.
quietly bysort betnr jahr: gen dup = cond(_N==1, 0, _n)
gen designated = inlist(dup, 0, 1)

* Firm size (number of employees) polynomial
cap drop dummy
* head-count of each firm-year
egen sizeyear = total(1), by(betnr jahr)
replace sizeyear = . if designated != 1
* average yearly head-count of the firm, then its square and cube
egen firmsize1 = mean(sizeyear), by(betnr)
gen firmsize2 = firmsize1^2
gen firmsize3 = firmsize1^3

* separation rate: share of a firm's employees in year t who are NOT observed at the same firm in year t+1,
* averaged over the firm's years.
*
* "observed" is evaluated within person-firm, ordered by year. The data are unique by
* prs_id-betnr-jahr here, so this ordering has no ties and the result is reproducible.
* The earlier version compared each row with the person's next row ordered by year only:
* same-year rows at different firms were ordered arbitrarily, and anyone holding two jobs was
* counted as separating even when present at the same firm the following year.
* NOTE(review): a gap year at the same firm now counts as a separation (as the definition above
* implies); previously it did not when the person's next row was the same firm.
bysort prs_id betnr (jahr): gen observed = jahr[_n+1]==jahr+1
gen not_observed = observed==0
* no t+1 exists for 2019 rows (presumably the last year of the data -- confirm), so leave them out
replace not_observed = . if jahr==2019
* firm-year separation share, kept on the designated row only so each year counts once
egen sepyear = mean(not_observed), by(betnr jahr)
replace sepyear = . if designated!=1
egen separation_rate = mean(sepyear), by(betnr)
drop observed not_observed

* Mean of log wages within the firm-year
bysort betnr jahr: egen meanwageyear = mean(ln_wage_gsoep)

* Within-firm SD of log wages: firm-year SD, kept on the designated row only,
* then averaged over the firm's years
by betnr jahr: egen sdwageyear = sd(ln_wage_gsoep)
replace sdwageyear = . if designated != 1
bysort betnr: egen sdwages = mean(sdwageyear)

* Firm employment growth rate across 2015-2019
* Step 1: spread each year's head-count to all rows of the firm (missing if the firm has no rows that year)
forvalues y = 2015/2019 {
	egen size`y' = max((jahr == `y') * sizeyear), by(betnr)
	replace size`y' = . if size`y' == 0
}

* Step 2: year-on-year growth, stored on the designated row of the later year
gen growthtemp = .
forvalues y = 2016/2019 {
	local prev = `y' - 1
	replace growthtemp = (size`y' - size`prev') / size`prev' if jahr == `y' & designated == 1
}
drop size2015 size2016 size2017 size2018 size2019

* Step 3: average the yearly growth rates within firm and winsorize
egen growth_nw = mean(growthtemp), by(betnr)
winsor growth_nw, gen(growth) p(0.02)

* Sex
* NOTE(review): sex_id is only renamed, not recoded -- confirm its coding matches a female indicator.
rename sex_id female

*** The firm-level covariates are done, so from here on only the relevant people are kept.
*** This shrinks the sample and makes the rest of the file run a bit faster.

* Keep people who have at least one observation in the 2015-2019 range
egen in_window = max(inrange(jahr, 2015, 2019)), by(prs_id)
keep if in_window == 1
drop in_window

* Education: collapse the training code into categories 1-3; every other code is missing for now
rename ieb_dba_id ausbildung
gen educ = cond(ausbildung == 1, 1, ///
	cond(ausbildung == 2, 2, ///
	cond(inlist(ausbildung, 11, 12), 3, .)))

* Inspect the grouped and the raw codes before the residual category is filled in
tab educ, m
tab ausbildung, m

* Residual category: all remaining (including missing) training codes
replace educ = 4 if missing(educ)

* One dummy per education category
foreach level of numlist 1/4 {
	gen ed`level' = educ == `level'
}

* Append the education dummies to the covariate list
global covlist1 "$covlist1 ed1 ed2 ed3 ed4"

* region
replace b_gem_schl_ao_akt_num = . if b_gem_schl_ao_akt_num<0 // places outside Germany
rename b_gem_schl_ao_akt_num ao_kreis

// crosswalking to lm_region

// lm_region temporarily holds 7-digit region codes (e.g. 10041100), which do not fit in an int
// (maximum 32,740). Declaring it long states the required type explicitly rather than relying on
// -replace- to promote the variable silently; the resulting storage type is the same.
gen long lm_region = .
label variable lm_region "Kropp/Schwengler Region"

//transform ao_kreis from gemeinde-level to kreis-level info (drops the last three digits)
replace ao_kreis = floor(ao_kreis/1000)

// Perform crosswalk
// Each local xw_<code> lists the Kreis codes assigned to the region with 7-digit code <code>.
// The loop at the end writes <code> into lm_region for every listed Kreis.
// No Kreis appears in more than one list, so the order of assignment is irrelevant.
// NOTE(review): a Kreis code that is not listed leaves lm_region missing -- confirm coverage, e.g.
// with -count if missing(lm_region) & !missing(ao_kreis)- after this block.

local xw_2000000 1001 1002 1003 1004 1051 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 ///
	2000 3353 3354 3355 3359 3360 13004 13006 13054 13058 13060 15081
local xw_3101000 3101 3102 3103 3151 3153 3154 3157 3158
local xw_3152012 3152 3155 3156 16061 16062
local xw_3241001 3241 3252 3254 3256 3257 3351 3358
local xw_4011000 3251 3352 3356 3357 3361 3401 3458 4011 4012
local xw_5711000 3255 5711 5754 5758 5762 5766 5770 5774 5974
local xw_3403000 3402 3403 3405 3451 3452 3453 3455 3457 3461 3462
local xw_3404000 3404 3459 3460
local xw_5515000 3454 3456 5515 5554 5558 5566
local xw_5113000 5111 5112 5113 5114 5116 5117 5119 5120 5122 5124 5154 5158 5162 5166 5170 ///
	5370 5512 5513 5562 5570 5911 5913 5914 5915 5916 5954 5958 5962 5978
local xw_5313000 5313 5354 5334 5358
local xw_5315000 5314 5315 5316 5362 5366 5374 5378 5382 7131
local xw_5970040 5966 5970 7132
local xw_6412000 6411 6412 6413 6414 6432 6433 6434 6435 6436 6437 6438 6439 6440 ///
	6531 6532 6533 6534 6535 6631 7133 7134 7141 7315 7331 7339 9661 9671 9676
local xw_8222000 6431 7311 7313 7314 7316 7318 7319 7332 7333 7337 7338 8221 8222 8226
local xw_6611000 6611 6632 6633 6634 6635 6636
local xw_7111000 7111 7135 7137 7138 7140 7143
local xw_7211000 7211 7231 7232 7233 7235
local xw_10041100 7312 7317 7320 7335 7336 7340 10041 10042 10043 10044 10045 10046
local xw_8212000 7334 8211 8212 8215 8216
local xw_8111000 8111 8115 8116 8117 8118 8119 8121 8125 8126 8127 8128 8135 8136 ///
	8225 8231 8235 8236 8237 8415 8416 8417
local xw_8311000 8311 8315 8316
local xw_8317096 8317
local xw_8326074 8325 8326 8327
local xw_8335075 8335
local xw_8336050 8336 8337
local xw_8421000 8421 8425 8426 8437 9764 9775
local xw_8436064 8435 8436 9776
local xw_9162000 9161 9162 9163 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 ///
	9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9261 9274 9277 9279 ///
	9761 9762 9763 9771 9772 9773 9774 9777 9778 9779 9780
local xw_9262000 9262 9271 9272 9275 9276
local xw_9362000 9263 9273 9278 9362 9372 9375 9376
local xw_9564000 9361 9371 9373 9461 9471 9474 9561 9562 9563 9564 9565 ///
	9571 9572 9573 9574 9575 9576 9577
local xw_9363000 9363 9374
local xw_9479136 9377 9479
local xw_9462000 9462 9472 9477
local xw_9463000 9463 9473 9476 9478 16072
local xw_9464000 9464 9475
local xw_9662000 9662 9672 9673 9674 9678
local xw_9663000 9663 9675 9677 9679
local xw_11000000 11000 11100 12051 12053 12054 12060 12061 12063 12064 12065 ///
	12067 12068 12069 12070 12072 12073
local xw_14612000 12052 12062 12066 12071 14612 14625 14626 14627 14628
local xw_13001000 13001 13005 13059 13071 13072 13073 13074 13061
local xw_13002000 13002 13052 13055 13056 13062 13075 13076
local xw_13003000 13003 13051 13053 13057
local xw_14511000 14511 14521 14522 14523 14524
local xw_14713000 14713 14729 14730 15001 15002 15082 15084 15087 15088 15091 16077
local xw_15003000 15003 15083 15086 15089 15090
local xw_15085370 15085
local xw_16051000 16051 16052 16053 16055 16056 16063 16064 16065 16067 16068 ///
	16070 16071 16073 16074 16075 16076
local xw_16054000 16054 16066 16069

local region_targets 2000000 3101000 3152012 3241001 4011000 5711000 3403000 3404000 ///
	5515000 5113000 5313000 5315000 5970040 6412000 8222000 6611000 7111000 7211000 ///
	10041100 8212000 8111000 8311000 8317096 8326074 8335075 8336050 8421000 8436064 ///
	9162000 9262000 9362000 9564000 9363000 9479136 9462000 9463000 9464000 9662000 ///
	9663000 11000000 14612000 13001000 13002000 13003000 14511000 14713000 15003000 ///
	15085370 16051000 16054000

foreach target of local region_targets {
	foreach kreis of local xw_`target' {
		replace lm_region = `target' if ao_kreis == `kreis'
	}
}


// Change identifiers to # 1-50
// The 7-digit region codes are listed in the order of their new identifier:
// the first code becomes 1, the second 2, ... the last 50.
local region_codes 2000000 3101000 3152012 3241001 4011000 5711000 3403000 3404000 ///
	5515000 5113000 5313000 5315000 5970040 6412000 8222000 6611000 7111000 7211000 ///
	10041100 8212000 8111000 8311000 8317096 8326074 8335075 8336050 8421000 8436064 ///
	9162000 9262000 9362000 9564000 9363000 9479136 9462000 9463000 9464000 9662000 ///
	9663000 11000000 14612000 13001000 13002000 13003000 14511000 14713000 15003000 ///
	15085370 16051000 16054000

local new_id 0
foreach code of local region_codes {
	local ++new_id
	replace lm_region = `new_id' if lm_region == `code'
}
			
	
// Value labels: one city name per region index, defined in a single statement
label define lm_region_labels ///
	 1 "Hamburg" ///
	 2 "Braunschweig" ///
	 3 "Göttingen" ///
	 4 "Hannover" ///
	 5 "Bremen" ///
	 6 "Bielefeld" ///
	 7 "Oldenburg (Oldenburg)" ///
	 8 "Osnabrück" ///
	 9 "Münster" ///
	10 "Essen" ///
	11 "Aachen" ///
	12 "Köln" ///
	13 "Siegen" ///
	14 "Frankfurt am Main" ///
	15 "Mannheim" ///
	16 "Kassel" ///
	17 "Koblenz" ///
	18 "Trier" ///
	19 "Saarbrücken" ///
	20 "Karlsruhe" ///
	21 "Stuttgart" ///
	22 "Freiburg im Breisgau" ///
	23 "Offenburg" ///
	24 "Villingen-Schwenningen" ///
	25 "Singen (Hohentwiel)" ///
	26 "Lörrach" ///
	27 "Ulm" ///
	28 "Ravensburg" ///
	29 "München" ///
	30 "Passau" ///
	31 "Regensburg" ///
	32 "Nürnberg" ///
	33 "Weiden i.d. OPf." ///
	34 "Marktredwitz" ///
	35 "Bayreuth" ///
	36 "Coburg" ///
	37 "Hof" ///
	38 "Schweinfurt" ///
	39 "Würzburg" ///
	40 "Berlin" ///
	41 "Dresden" ///
	42 "Greifswald" ///
	43 "Neubrandenburg" ///
	44 "Rostock" ///
	45 "Chemnitz" ///
	46 "Leipzig" ///
	47 "Magdeburg" ///
	48 "Wernigerode" ///
	49 "Erfurt" ///
	50 "Suhl"

* attaching the label set to the region variable
label values lm_region lm_region_labels



tabulate lm_region, missing // checking the recoded variable

rename lm_region region1
summarize region1
local maxvalregion = r(max) // largest region index; reused further down when the rd* dummies are generated

* collecting the names of the region dummies (rd1, rd2, ...) and appending them to the covariate list in one go
local regiondummies
forvalues r = 1/`maxvalregion' {
	local regiondummies `regiondummies' rd`r'
}
global covlist1 "$covlist1 `regiondummies'"



* 5-digit occupation
summarize ieb_beruf_kons_num, detail // inspecting the raw variable
rename ieb_beruf_kons_num occupation
tabulate occupation, missing // inspecting
* missings are stored as negative values; pooling them into one dedicated category
replace occupation = 999999999 if occupation<0
capture noisily tabulate occupation, missing

* occupation enters the covariate list as a factor variable
global covlist1 "$covlist1 i.occupation"


* industry
* grouping the WZ 2008 codes into the 21 'sections' from here https://www.klassifikationsserver.de/klassService/jsp/common/url.jsf?variant=wz2008&lang=EN
gen industry = .

* sections 1 and 2 are delimited directly on the raw code
replace industry = 1 if wz08_kons_num>0 & wz08_kons_num<5000 // the lower bound ensures negative values are coded to missing
replace industry = 2 if inrange(wz08_kons_num,5000,9999)

* sections 3-21 are delimited on the code divided by 1000 (rounded down); each
* "lower upper" pair below is the inclusive range belonging to the next section number
tempvar wzdiv
gen double `wzdiv' = floor(wz08_kons_num/1000)

local section = 3
foreach bounds in "10 34" "35 35" "36 39" "41 43" "45 47" "49 53" "55 56" "58 63" "64 66" "68 68" "69 75" "77 82" "84 84" "85 85" "86 88" "90 93" "94 96" "97 98" "99 99" {
	gettoken lower upper : bounds
	replace industry = `section' if inrange(`wzdiv',`lower',`upper') & industry==.
	local ++section
}
drop `wzdiv' // dropping explicitly so the helper variable cannot end up in a saved dataset
* about 50% of the sample is in industry 3 (manufacturing) or 7 (wholesale and retail trade)

* everything still unassigned (including missing and negative codes) goes into a residual category
replace industry = 22 if missing(industry)

tabulate industry, missing



* tenure
* tenure = number of calendar years, from the first year in the data up to and including
* the current one, in which the prs_id-betnr pair has at least one record
local pair "prs_id betnr"

summarize jahr
local firstyear = r(min)
local lastyear = r(max)

* d<year> flags whether the pair has any record in that year
forvalues y = `firstyear'/`lastyear' {
	egen d`y' = max(jahr==`y'), by(`pair')
}

* years with a record before 2015
gen tenure_pre15 = 0
forvalues y = `firstyear'/2014 {
	replace tenure_pre15 = tenure_pre15+d`y'
	drop d`y'
}

* accumulating the 2015-2019 flags on top of the pre-2015 count; tenure is left missing
* for records before 2015 and is only assigned for records in 2015-2019
tempvar running
gen `running' = tenure_pre15
gen tenure = 0
replace tenure = . if jahr<2015
forvalues y = 2015/2019 {
	replace `running' = `running'+d`y'
	replace tenure = `running' if jahr==`y'
}

drop `running' tenure_pre15 d2015 d2016 d2017 d2018 d2019

* tenure polynomial (tenure1 is the linear term)
rename tenure tenure1
forvalues p = 2/3 {
	gen tenure`p' = tenure1^`p'
}

* age polynomial (age1 is the linear term)
rename age age1
forvalues p = 2/3 {
	gen age`p' = age1^`p'
}

* tenure has been calculated from the full history, so the data can now be cut down to the relevant year range
drop if !inrange(jahr,2015,2019)

* merging on firm effects; records without a match in the firm-effects file are kept
merge m:1 betnr using ${orig}/SOEP-ADIAB_7519_v1_akm_estab.dta, keepusing(feff_2010_2017) keep(master match) nogenerate

* only EUE movers and SOEP people are needed from here on (both are identified near the beginning of this file)
drop if !(moversample==1 | gsoep==1)

* winsorizing the firm effect on the restricted sample
winsor feff_2010_2017, p(0.02) gen(firm_effect)

rename soep_pid pid


*** creating dummies for the categorical variables

* one dummy per region index (their names were already added to the covariate list above)
di `maxvalregion'
forvalues r = 1/`maxvalregion' {
	gen rd`r' = (region1==`r')
}

* one dummy per industry category; a dummy only enters the covariate list if at
* least one observation belongs to that category (e.g. category 22 may be empty)
forvalues k = 1/22 {
	gen id`k' = (industry==`k')
	quietly count if id`k'==1
	if r(N)>0 {
		global covlist1 "$covlist1 id`k'"
	}
}


*** adding interactions to the list of lasso covariates

* industry and region
* (only for industry categories that contain at least one observation)
forvalues n = 1/22 {
	quietly su id`n'
	if r(sum)>0 {
		forvalues m = 1/`maxvalregion' {
			gen ir`n'_`m' = id`n'*rd`m'
			* appending to covlist1 itself, so that the interaction actually enters the lasso regressions below
			global covlist1 "$covlist1 ir`n'_`m'"
		}
	}
}

* age and education (i.e., skills and experience)
* each term of the age polynomial interacted with each of the ed1-ed4 variables
forvalues n = 1/3 {
	forvalues m = 1/4 {
		gen ae`n'_`m' = age`n'*ed`m'
		global covlist1 "$covlist1 ae`n'_`m'"
	}
}

* saving dataset with covariate info
save "$data/machinelearning_chars_unempFT.dta", replace

* printing the final covariate list to the log
di "$covlist1"



************************* calculating ML predictions ***************************



* looping over whether the time window is 2015-2019 ("") or 2018-2019 ("1819")
* each pass fits the lasso on the EUE movers, predicts for everyone, and saves the
* predictions for the SOEP individuals; errors inside a pass are printed but do not stop the do-file
foreach jahrrestriction in "" "1819" { 

	cap noisily {
	
	* loading dataset with EUE movers and their wage changes (recall this dataset was created by 2_coworker_wagechanges)
	use "$data/machinelearning_unempFT.dta", clear
	
	* merging on covariates AND (crucially) adding SOEP individuals (this is the dataset we created above)
	merge 1:1 prs_id betnr jahr using "$data/machinelearning_chars_unempFT.dta", keepusing(region1 occupation industry educ age1 age2 age3 tenure1 tenure2 tenure3 gsoep ln_wage_gsoep pid firm_effect ed* rd* id* ir* ae* female firmsize1 firmsize2 firmsize3 separation_rate sdwages growth meanwageyear wz08_kons_num)
			
	* restricting to the right set of observations: EUE moves and SOEP individuals
	keep if _merge==3 | gsoep==1
	drop _merge
	
	* also restricting to relevant year range if necessary (SOEP individuals are always kept)
	if "`jahrrestriction'"=="1819" keep if jahr==2018 | jahr==2019 | gsoep==1
			
	replace ln_wage = ln_wage_gsoep if gsoep==1
	
	di "year-restriction `jahrrestriction'"
				
			
	* doing the regression (estimated on the EUE movers only)
	lassoregress delta_ln_wage_mover ${covlist1} if gsoep==0

	* out-of-sample prediction for GSOEP individuals
	predict p_delta_ln_wage_mover
						
	if "`jahrrestriction'"=="" {
		preserve
			cap noisily {
				keep prs_id p_delta_ln_wage_mover betnr firm_effect
				save "$data/mlpredictions_full_unempFTdelta_lncovlist1.dta", replace // saving dataset for plotting ML prediction distributions
			}
		restore
	}
	
	*** diagnostic checks in case the predicted values are weird
	su p_delta_ln_wage_mover if gsoep==0
	su p_delta_ln_wage_mover if gsoep==1

	* comparing the distributions of the main continuous predictors between movers and SOEP individuals
	foreach var in ln_wage firm_effect age1 {
		su `var' if gsoep==0
		su `var' if gsoep==1
	}
	
	* same comparison for the categorical predictors
	foreach var in region1 occupation industry educ {
		cap noisily groups `var' if gsoep==0, select(f>20)
		cap noisily groups `var' if gsoep==1, select(f>20)
	}
	
	* keeping dataset with predictions for SOEP individuals
	keep if gsoep==1 & jahr==2019
	
	* for any person-year duplicates, keeping larger log wage
	egen maxwage = max(ln_wage), by(prs_id)
	drop if maxwage!=ln_wage
	sort prs_id
	quietly by prs_id: gen dup = cond(_N==1,0,_n)
	cap noisily groups dup, select(f>20)
	drop if dup>1
	
	* saving dataset 
	* preserve/restore sit outside the capture block, so the data are restored even if
	* one of the commands in between fails and the steps below still see the full set of variables
	if "`jahrrestriction'"=="" {
		preserve
			cap noisily {
				rename tenure1 ml_tenure1
				rename firm_effect ml_firm_effect
				keep pid ml_tenure1 ml_firm_effect
				save "$data/machinelearning_fullcovariates_unempFT.dta", replace
			}
		restore
	}
		
	keep pid jahr p_delta_ln_wage_mover
	
	di "summarizing wage change variable"
	su p_delta_ln_wage_mover, d
	di _N
	
	* the prediction from the 2018-2019 window gets its own variable name
	if "`jahrrestriction'"=="1819" {
		rename p_delta_ln_wage_mover p_delta_ln_wage_mover1819
		keep pid jahr p_delta_ln_wage_mover1819
	}
	
	cap noisily tab jahr, m
		
	save "$data/machinelearning_predictions_unempFT`jahrrestriction'.dta", replace
	} // cap noisily
} // jahrrestriction

* repeating the ML procedure, but testing its accuracy using a split-sample training/evaluation procedure

use "$data/machinelearning_unempFT.dta", clear

	
merge 1:1 prs_id betnr jahr using "$data/machinelearning_chars_unempFT.dta", keepusing(region1 occupation industry educ age1 age2 age3 tenure1 tenure2 tenure3 gsoep ln_wage_gsoep pid firm_effect ed* rd* id* ir* ae* female firmsize1 firmsize2 firmsize3 separation_rate sdwages growth)

* calculating leave-out mean of coworker wage changes, as a comparison predictor
gen tempdummy = 1 // constant kept in the saved dataset; not used in the calculations below
bys betnr: egen mean_delta_ln_wage_mover = mean(delta_ln_wage_mover)
* the establishment mean is taken over non-missing wage changes only, so the number of movers
* has to count exactly those rows; rows brought in by the merge without a wage change
* (e.g. SOEP individuals) must not inflate it, or the leave-out formula below is wrong
bys betnr: egen num_movers = count(delta_ln_wage_mover)

* leave-out mean = (sum over all movers in the establishment - own wage change)/(number of other movers)
gen coworker_changes = mean_delta_ln_wage_mover
replace coworker_changes = . if num_movers==1
replace coworker_changes = ((coworker_changes*num_movers)-delta_ln_wage_mover)/(num_movers-1) if num_movers>1

gen msg_coworker_changes = coworker_changes==.
replace coworker_changes = 1000 if coworker_changes==. // dealing with missing values using a missing-value dummy

* restricting to EUE moves and SOEP individuals
keep if _merge==3 | gsoep==1
drop _merge

replace ln_wage = ln_wage_gsoep if gsoep==1

* random 50/50 split into training (insample==1) and evaluation (insample==0) observations
gen splitvar = runiform(0,1)
gen insample = splitvar<0.5
	
* doing the regression (training half of the movers only)
lassoregress delta_ln_wage_mover ${covlist1} if gsoep==0 & insample==1
				
* evaluating the prediction on the held-out half
predict p_delta_ln_wage_mover		
corr delta_ln_wage_mover p_delta_ln_wage_mover if insample==0

reg delta_ln_wage_mover p_delta_ln_wage_mover if insample==0
	
* squared prediction error
gen bias2 = (delta_ln_wage_mover-p_delta_ln_wage_mover)^2
su bias2 if insample==0

cap drop bias2

* for comparison: accuracy of coworker wage changes
di "coworker prediction evaluation"

* NOTE(review): this regression includes the placeholder value 1000 without the missing-value dummy - confirm that this is intended
reg delta_ln_wage_mover coworker_changes, robust
	
reg delta_ln_wage_mover coworker_changes msg_coworker_changes if gsoep==0 & insample==1
predict p_delta_ln_wage_mover2

reg delta_ln_wage_mover p_delta_ln_wage_mover2 if insample==0

* squared prediction error of the coworker-based predictor
gen bias2 = (delta_ln_wage_mover-p_delta_ln_wage_mover2)^2
su bias2 if insample==0

* save for partial R^2 calculation
save "$data/machinelearning_partialr.dta", replace



log close
clear